#!/usr/bin/env ruby
# build_meta --- generator of parser tables for SPARQL::Grammar::Parser
#     Derived from: 
#        http://www.w3.org/2000/10/swap/grammar/predictiveParser.py
#        - predictiveParser.py, Tim Berners-Lee, 2004
#
require 'rubygems'
$:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "..", 'lib')))
require 'rdf/n3'
require 'getoptlong'

# Build rdf/n3/parser/meta.rb from http://www.w3.org/2000/10/swap/grammar/n3-selectors.n3

class BNF < RDF::Vocabulary("http://www.w3.org/2000/10/swap/grammar/bnf#"); end
class REGEX < RDF::Vocabulary("http://www.w3.org/2000/10/swap/grammar/regex#"); end
class N3 < RDF::Vocabulary("http://www.w3.org/2000/10/swap/grammar/n3#"); end

class PredictiveParser
  attr_accessor :already, :agenda, :errors, :literalTerminals, :branchTable, :tokenRegexps
  attr_accessor :graph
  
  def initialize
    @already = []
    @agenda = []
    @errors = []
    @literalTerminals = {}
    @branchTable = {}
    @tokenRegexps = {}
  end

  def parse(file)
    progress("Loading " + file)
    @graph = RDF::Graph.load(file)
    progress("Loaded #{@graph.count} statements.")
  end

  def recordError(str)
    errors << str
    "##### ERROR:  #{str}"
  end

  def progress(str); puts(str); end
  def chatty(str); progress(str) if $verbose; end

  def runProduction(lhs)
    doProduction(lhs)
    while !@agenda.empty?
      x = @agenda.shift
      @already << x
      doProduction(x)
    end

    if !@errors.empty?
      progress("###### FAILED with #{errors.length} errors.")
      @errors.each {|s| progress("  #{s}")}
      exit(-2)
    else
      progress("Ok for predictive parsing")
    end 
  end
  
  # Generate branch tables for one production
  def doProduction(lhs)
    if lhs == BNF.void
      progress("\nvoid")
      return
    end
    if lhs == BNF.eof
      progress( "\nEOF")
      return
    end
    if lhs.is_a?(RDF::Literal)
      literalTerminals[lhs.value()] = 1
      return
    end

    branchDict = {}

    rhs = graph.first_object(subject: lhs, predicate: BNF.matches)
    if rhs
      chatty("\nToken #{lhs} matches regexp #{rhs}")
      tokenRegexps[lhs] = rhs.value

      cc = graph.query(subject: lhs, predicate: BNF.canStartWith)
      progress(recordError("No record of what token #{lhs} can start with")) if cc.empty?
      cc.each {|statement| chatty("  Can start with: #{statement.object}")}
      return
    end
  
    rhs = graph.first_object(subject: lhs, predicate: BNF.mustBeOneSequence)
    unless rhs
        progress(recordError("No definition of #{lhs}"))
  #     raise RuntimeError("No definition of %s  in\n %s" %(`lhs`, `g`))
        return
    end

    options = rhs
    progress("\nProduction #{lhs} :: #{options}")
    graph.query(subject: lhs, predicate: BNF.canPrecede) do |statement|
      chatty("  Can precede '#{statement.object}'")
    end

    graph.query(subject: lhs, predicate: BNF.branch) do |statement|
      branch = statement.object
      sequence = graph.first_object(subject: statement.object, predicate: BNF.sequence)
      option = RDF::List.new(subject: sequence, graph: graph).to_a
      progress("  option: #{option}")
    
      option.each do |part|
        agenda << part unless already.include?(part) || agenda.include?(part)
      end
    
      conditions = graph.query(subject: branch, predicate: BNF.condition).map(&:object)
      if conditions.empty?
        progress(recordError("NO SELECTOR for #{lhs} option #{option}"))
        if option.empty?
          # Void case - the tricky one
          graph.pattern(subject: lhs, predicate: BNF.canPrecede) do |st|
            progress("      Can precede #{st.object}")
          end
        end
      end
    
      progress("    Conditions: #{conditions.to_a.map(&:to_s)}")
      conditions.each do |str1|
        if branchDict.has_key?(str1)
          progress(
              "Conflict: #{str1} is also the condition for #{branchDict[str1]}")
        end
        branchDict[str1] = option
      end
    end
  
    branchDict.keys.each do |str1|
      branchDict.keys.each do |str2|
        s1, s2 = str1.to_s, str2.to_s
        if (s1.index(s2) == 0 || s2.index(s1) == 0) && branchDict[str1] != branchDict[str2]
          progress("WARNING: for #{lhs}, #{str1} indicates #{branchDict[str1]}, but #{str2} indicates #{branchDict[str2]}")
        end
      end
    end
  
    branchTable[lhs] = branchDict
  end
  
  def litOrNot(value)
    (value.is_a?(RDF::Literal) ? "" : ":") + value.to_s.dump
  end
  
  def outputBranchTable(io, indent = 0)
    ind0 = '  ' * indent
    ind1 = ind0 + '  '
    ind2 = ind1 + '  '
    ind3 = ind2 + '  '
    io.puts "#{ind0}BRANCHES = {"
    branchTable.keys.sort_by(&:to_s).each do |prod|
      # Special case double, integer, and decimal to output just a numericliteral, due to a parser conflict
      next if prod.to_s =~ /numericliteral/
      io.puts "#{ind1}#{litOrNot(prod)} => {"
      branchTable[prod].keys.sort_by(&:to_s).each do |term|
        list = branchTable[prod][term].map {|t2| litOrNot(t2)}.join(",\n#{ind3}")
        io.puts "#{ind2}#{litOrNot(term)} => [#{list}],"
      end
      io.puts "#{ind1}},"
    end
    io.puts "#{ind0}}\n"
  end

  def outputRegexpTable(io, indent = 0)
    ind0 = '  ' * indent
    ind1 = ind0 + '  '
    io.puts "#{ind0}REGEXPS = {"
    tokenRegexps.keys.sort_by(&:to_s).each do |prod|
      # Special case double, integer, and decimal to output just a numericliteral, due to a parser conflict
      next if prod.to_s =~ /(integer|double|decimal)/
      io.puts "#{ind1}#{litOrNot(prod)} => Regexp.compile(" +
      case prod.to_s
      when /barename/     then  %q(%(^[#{BARENAME_START}][#{BARENAME_TAIL}]*))
      when /explicituri/  then  %q("^<[^>]*>")
      when /langcode/     then  %q("^[a-zA-Z]+(-[a-zA-Z0-9]+)*")
      when /prefix/       then  %q(%(^([#{BARENAME_START}][#{BARENAME_TAIL}]*)?:))
      when /qname/        then  %q(%(^(([#{BARENAME_START}][#{BARENAME_TAIL}]*)?:)?([#{BARENAME_START}][#{BARENAME_TAIL}]*)?))
      when /variable/     then  %q(%(^\\\\?[#{BARENAME_START}][#{BARENAME_TAIL}]*))
      else                      tokenRegexps[prod].dump
      end + "),"
    end

    io.puts "\n#{ind1}# Hack to replace integer|double|decimal with numericliteral"
    io.puts "#{ind1}#{litOrNot(N3.numericliteral)} => Regexp.compile(" + %q(%(^[-+]?[0-9]+(\\\\.[0-9]+)?(e[-+]?[0-9]+)?)) + ")"
    io.puts "#{ind0}}\n"
  end
end

$verbose = false
$debug = false
grammarFile = File.expand_path(File.join(File.dirname(__FILE__), "../lib/rdf/n3/reader/n3-selectors.n3"))
start = N3.document
output = STDOUT

opts = GetoptLong.new(
  ["--debug", GetoptLong::NO_ARGUMENT],
  ["--verbose", GetoptLong::NO_ARGUMENT],
  ["--grammar", GetoptLong::REQUIRED_ARGUMENT],
  ["--start", GetoptLong::REQUIRED_ARGUMENT],
  ["--output", "-o", GetoptLong::REQUIRED_ARGUMENT],
  ["--help", "-?", GetoptLong::NO_ARGUMENT]
)
opts.each do |opt, arg|
  case opt
  when '--verbose' then $verbose = true
  when '--debug' then $debug = true
  when '--grammar' then grammarFile = arg
  when '--as' then parseAs = arg
  when '--output' then output = File.open(arg, "w")
  when '--help'
    puts %(Usage: build_meta --grammar=file --as=uri [--output=file]
        --grammar=file  This is the RDF augmented grammar
        --start=uri     This is the URI of the production as which the document
                        is to be parsed
        --output=file   Where to save output
)
    exit(0)
  end
end

pp = PredictiveParser.new

pp.parse(grammarFile)
pp.runProduction(start)

unless output == STDOUT
  output.puts "# This file is automatically generated by #{__FILE__}"
  output.puts "# Branch and Regexp tables derived from #{grammarFile}"
  output.puts "module RDF::N3::Meta"
end
pp.outputBranchTable(output, 1)
output.puts %q(
  if RUBY_VERSION >= "1.9.0"
    BARENAME_START = "A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\u{10000}-\u{effff}"
    BARENAME_TAIL = "0-9#{BARENAME_START}\u00b7\u0300-\u036f\u203f-\u2040\\\\-"
  else
    BARENAME_START = "A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\xff"
    BARENAME_TAIL = "0-9#{BARENAME_START}\xb7\\\\-"
  end
)
pp.outputRegexpTable(output, 1)
unless output == STDOUT
  output.puts "end"
end
